---
title: "Time Series in R"
author: "Arvind Venkatadri"
date: "2022-12-12"
output:
  rmdformats::readthedown:
    highlight: kate
    toc: TRUE
    toc_float: TRUE
    toc_depth: 2
    df_print: paged
    number_sections: TRUE
    code_folding: hide
    code_download: TRUE
editor_options:
  markdown:
    wrap: 72
---
We will Analyze a Time Series Dataset in R.
## timetk

Let us inspect what datasets are available in the package timetk. Type
data(package = "timetk") in your Console to see what datasets are
available.
Let us choose the Walmart Sales dataset. See here for more details: Walmart Recruiting - Store Sales Forecasting |Kaggle
# Load the Walmart weekly sales data shipped with the timetk package
data("walmart_sales_weekly")
print(walmart_sales_weekly)
#> # A tibble: 1,001 × 17
#> id Store Dept Date Weekly…¹ IsHol…² Type Size Tempe…³ Fuel_…⁴ MarkD…⁵ MarkD…⁶ MarkD…⁷ MarkD…⁸ MarkD…⁹ CPI Unemp…˟
#> <fct> <dbl> <dbl> <date> <dbl> <lgl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1_1 1 1 2010-02-05 24924. FALSE A 151315 42.3 2.57 NA NA NA NA NA 211. 8.11
#> 2 1_1 1 1 2010-02-12 46039. TRUE A 151315 38.5 2.55 NA NA NA NA NA 211. 8.11
#> 3 1_1 1 1 2010-02-19 41596. FALSE A 151315 39.9 2.51 NA NA NA NA NA 211. 8.11
#> 4 1_1 1 1 2010-02-26 19404. FALSE A 151315 46.6 2.56 NA NA NA NA NA 211. 8.11
#> # … with 997 more rows, and abbreviated variable names ¹Weekly_Sales, ²IsHoliday, ³Temperature, ⁴Fuel_Price, ⁵MarkDown1,
#> # ⁶MarkDown2, ⁷MarkDown3, ⁸MarkDown4, ⁹MarkDown5, ˟Unemployment
# Quick structural summary: categorical, date, and quantitative variables
mosaic::inspect(walmart_sales_weekly)
#>
#> categorical variables:
#> name class levels n missing distribution
#> 1 id factor 3331 1001 0 1_1 (14.3%), 1_3 (14.3%), 1_8 (14.3%) ...
#> 2 IsHoliday logical 2 1001 0 FALSE (93%), TRUE (7%)
#> 3 Type character 1 1001 0 A (100%)
#>
#> Date variables:
#> name class first last min_diff max_diff n missing
#> 1 Date Date 2010-02-05 2012-10-26 0 days 7 days 1001 0
#>
#> quantitative variables:
#> name class min Q1 median Q3 max mean sd n missing
#> 1 Store numeric 1.0000 1.0000 1.0000 1.0000 1.0000 1.000000e+00 0.000000e+00 1001 0
#> 2 Dept numeric 1.0000 3.0000 13.0000 93.0000 95.0000 3.585714e+01 3.849159e+01 1001 0
#> 3 Weekly_Sales numeric 6165.7300 28257.3000 39886.0600 77943.5700 148798.0500 5.464634e+04 3.627627e+04 1001 0
#> 4 Size numeric 151315.0000 151315.0000 151315.0000 151315.0000 151315.0000 1.513150e+05 0.000000e+00 1001 0
#> 5 Temperature numeric 35.4000 57.7900 69.6400 80.4900 91.6500 6.830678e+01 1.420767e+01 1001 0
#> 6 Fuel_Price numeric 2.5140 2.7590 3.2900 3.5940 3.9070 3.219699e+00 4.260286e-01 1001 0
#> 7 MarkDown1 numeric 410.3100 4039.3900 6154.1400 10121.9700 34577.0600 8.090766e+03 6.550983e+03 357 644
#> 8 MarkDown2 numeric 0.5000 40.4800 144.8700 1569.0000 46011.3800 2.941315e+03 7.873661e+03 294 707
#> 9 MarkDown3 numeric 0.2500 6.0000 25.9650 101.6400 55805.5100 1.225400e+03 7.811934e+03 350 651
#> 10 MarkDown4 numeric 8.0000 577.1400 1822.5500 3750.5900 32403.8700 3.746085e+03 5.948867e+03 357 644
#> 11 MarkDown5 numeric 554.9200 3127.8800 4325.1900 6222.2500 20475.3200 5.018655e+03 3.254071e+03 357 644
#> 12 CPI numeric 210.3374 211.5312 215.4599 220.6369 223.4443 2.159969e+02 4.337818e+00 1001 0
#> 13 Unemployment numeric 6.5730 7.3480 7.7870 7.8380 8.1060 7.610420e+00 3.825958e-01 1001 0
# Column-wise preview of types and leading values
walmart_sales_weekly %>% glimpse()
#> Rows: 1,001
#> Columns: 17
#> $ id <fct> 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_1, 1_…
#> $ Store <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ Dept <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ Date <date> 2010-02-05, 2010-02-12, 2010-02-19, 2010-02-26, 2010-03-05, 2010-03-12, 2010-03-19, 2010-03-26, 2010-04-02…
#> $ Weekly_Sales <dbl> 24924.50, 46039.49, 41595.55, 19403.54, 21827.90, 21043.39, 22136.64, 26229.21, 57258.43, 42960.91, 17596.9…
#> $ IsHoliday <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
#> $ Type <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
#> $ Size <dbl> 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151315, 151…
#> $ Temperature <dbl> 42.31, 38.51, 39.93, 46.63, 46.50, 57.79, 54.58, 51.45, 62.27, 65.86, 66.32, 64.84, 67.41, 72.55, 74.78, 76…
#> $ Fuel_Price <dbl> 2.572, 2.548, 2.514, 2.561, 2.625, 2.667, 2.720, 2.732, 2.719, 2.770, 2.808, 2.795, 2.780, 2.835, 2.854, 2.…
#> $ MarkDown1 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ MarkDown2 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ MarkDown3 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ MarkDown4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ MarkDown5 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ CPI <dbl> 211.0964, 211.2422, 211.2891, 211.3196, 211.3501, 211.3806, 211.2156, 211.0180, 210.8204, 210.6229, 210.488…
#> $ Unemployment <dbl> 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 7.808, 7.808, 7.808, 7.808, 7.808, 7.808, 7.808, 7.…
# Try this in your Console
# help("walmart_sales_weekly")
The data is described as:
A tibble: 1,001 x 17

- id: Factor. Unique series identifier.
- Store: Numeric. Store ID.
- Dept: Numeric. Department ID.
- Date: Date. Weekly timestamp.
- Weekly_Sales: Numeric. Sales for the given department in the given store.
- IsHoliday: Logical. Whether the week is a "special" holiday for the store.
- Type: Character. Type identifier of the store.
- Size: Numeric. Store square-footage.
- Temperature: Numeric. Average temperature in the region.
- Fuel_Price: Numeric. Cost of fuel in the region.
- MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5: Numeric. Anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA.
- CPI: Numeric. The consumer price index.
- Unemployment: Numeric. The unemployment rate in the region.
Very cool to know that mosaic::inspect() identifies date variables
separately!
NOTE:
1. This is still a data.frame, with a time-oriented variable of course,
and not yet a time-series object. Note that this data frame has the YMD columns repeated for each Dept.
2. The Date column has repeated entries for each Dept! To deal with this repetition, we will always need to split the Weekly_Sales by the Dept column before we plot or analyze.
Since our sales are weekly, we will convert this data frame into a time-series object (a tsibble), with Date as the time index:
# Convert the data frame into a tsibble:
# - index = Date is the time variable
# - key = Dept identifies each unique "subject" (series) being measured;
#   all other columns (e.g. Weekly_Sales) become measured variables.
# Each observation must be uniquely identified by (index, key).
walmart_time <- walmart_sales_weekly %>%
  as_tsibble(index = Date,
             key = Dept)
walmart_time
#> # A tsibble: 1,001 x 17 [7D]
#> # Key: Dept [7]
#> id Store Dept Date Weekly…¹ IsHol…² Type Size Tempe…³ Fuel_…⁴ MarkD…⁵ MarkD…⁶ MarkD…⁷ MarkD…⁸ MarkD…⁹ CPI Unemp…˟
#> <fct> <dbl> <dbl> <date> <dbl> <lgl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1_1 1 1 2010-02-05 24924. FALSE A 151315 42.3 2.57 NA NA NA NA NA 211. 8.11
#> 2 1_1 1 1 2010-02-12 46039. TRUE A 151315 38.5 2.55 NA NA NA NA NA 211. 8.11
#> 3 1_1 1 1 2010-02-19 41596. FALSE A 151315 39.9 2.51 NA NA NA NA NA 211. 8.11
#> 4 1_1 1 1 2010-02-26 19404. FALSE A 151315 46.6 2.56 NA NA NA NA NA 211. 8.11
#> # … with 997 more rows, and abbreviated variable names ¹Weekly_Sales, ²IsHoliday, ³Temperature, ⁴Fuel_Price, ⁵MarkDown1,
#> # ⁶MarkDown2, ⁷MarkDown3, ⁸MarkDown4, ⁹MarkDown5, ˟Unemployment
The easiest way is to use autoplot from the feasts package. You may need to specify the actual measured variable, if there is more than one numerical column:
# Static plot of the measured variable, one line per key (Dept)
walmart_time %>% autoplot(.vars = Weekly_Sales)

timetk gives us interactive plots that may be more evocative than
the static plot above. The basic plot function with timetk is
plot_time_series. There are arguments for the date variable, the value
you want to plot, colours, groupings etc.
Let us explore this dataset using timetk, using our trusted method of
asking Questions:
Q.1 How are the weekly sales different for each Department?
There are 7 Departments. So we should be fine plotting them, and also facetting by them, as we will see in a bit:
# Interactive time-series plot, coloured by Department
walmart_time %>%
  timetk::plot_time_series(Date, Weekly_Sales,
                           .color_var = Dept,
                           .legend_show = TRUE,
                           .title = "Walmart Sales Data by Department",
                           .smooth = FALSE)
Q.2. What do the sales per Dept look like during the month of December (Christmas time) in 2011? Show the individual Depts as facets.
We can of course zoom into the interactive plot above, but if we were to plot it anyway:
# Keep only the rows from December 1 to December 31, 2011
# (the data runs only up to Oct 2012, so Christmas 2011 is the
# last complete December available)
walmart_time %>%
  # Each end of the window is given as a 'YYYY-MM-DD' character date
  timetk::filter_by_time(.date_var = Date,
                         .start_date = "2011-12-01",
                         .end_date = "2011-12-31") %>%
  plot_time_series(.date_var = Date,
                   .value = Weekly_Sales,
                   .color_var = Dept,
                   .facet_vars = Dept,
                   .facet_ncol = 2,
                   .smooth = FALSE) # only ~4 points per facet
Clearly the “unfortunate” Dept#13 has seen something of a Christmas drop in sales, as has Dept#38 ! The rest, all is well, it seems…
Q.3 How do we smooth out some of the variations in the time series to be able to understand it better?
Sometimes there is too much noise in the time series observations and we
want to take what is called a rolling average. For this we will use
the function timetk::slidify to create an averaging function of our
choice, and then apply it to the time series using regular
dplyr::mutate
# Let's take the average of Sales for each month in each Department.
# slidify() returns a *function*, named "rolling_avg_month" here,
# which we can then apply with dplyr::mutate().
rolling_avg_month <- slidify(.period = 4,       # window of 4 weekly observations (~1 month)
                             .f = mean,         # the function to apply over each window
                             .align = "center", # centre the window on each observation
                             .partial = TRUE)   # compute partial windows at the series edges
rolling_avg_month
#> function (...)
#> {
#> slider_2(..., .slider_fun = slider::pslide, .f = .f, .period = .period,
#> .align = .align, .partial = .partial, .unlist = .unlist)
#> }
#> <bytecode: 0x000002b5ca1b5ff0>
#> <environment: 0x000002b5ca1b56c0>
OK, slidify creates a function! Let’s apply it to the Walmart Sales
time series…
walmart_time %>%
  # Group by Dept so the rolling window does not bleed across
  # department boundaries (a tsibble is sorted by key, then index;
  # without grouping, the last weeks of one Dept would be averaged
  # with the first weeks of the next)
  group_by(Dept) %>%
  mutate(avg_monthly_sales = rolling_avg_month(Weekly_Sales)) %>%
  ungroup() %>%
  timetk::plot_time_series(Date, avg_monthly_sales,
                           .color_var = Dept,
                           .smooth = FALSE)
Curves are smoother now. Need to check whether the averaging was done
on a per-Dept basis…should we have had a group_by(Dept) before the
averaging, and ungroup() before plotting? Try it !!
Each data point (\(Y_t\)) at time \(t\) in a Time Series can be expressed as either a sum or a product of 4 components, namely, Seasonality(\(S_t\)), Trend(\(T_t\)), Cyclic, and Error(\(e_t\)) (a.k.a White Noise).
Decomposing non-seasonal data means breaking it up into trend and irregular components. To estimate the trend component of a non-seasonal time series that can be described using an additive model, it is common to use a smoothing method, such as calculating the simple moving average of the time series.
timetk has the ability to achieve this: Let us plot the trend, seasonal, cyclic and irregular aspects of Weekly_Sales for Dept 38:
# Decompose Weekly_Sales for Dept 38 into trend, seasonal,
# and remainder components with an STL diagnostic plot
walmart_time %>%
  filter(Dept == 38) %>% # Dept is numeric — compare with a number, not "38"
  timetk::plot_stl_diagnostics(.date_var = Date, .value = Weekly_Sales)
#> frequency = 13 observations per 1 quarter
#> trend = 52 observations per 1 year
We can do this for all Dept using fable and fabletools:
# Fit one STL decomposition per key (Dept) using fable/fabletools.
# (To restrict to a single Dept, insert a filter() step before model().)
walmart_decomposed <- walmart_time %>%
  fabletools::model(stl = STL(Weekly_Sales))

# Extract the decomposition as a "dable" (decomposition table)
fabletools::components(walmart_decomposed)
#> # A dable: 1,001 x 8 [7D]
#> # Key: Dept, .model [7]
#> # : Weekly_Sales = trend + season_year + remainder
#> Dept .model Date Weekly_Sales trend season_year remainder season_adjust
#> <dbl> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 stl 2010-02-05 24924. 23138. 469. 1318. 24456.
#> 2 1 stl 2010-02-12 46039. 23129. 17545. 5366. 28495.
#> 3 1 stl 2010-02-19 41596. 23120. 24371. -5895. 17225.
#> 4 1 stl 2010-02-26 19404. 23111. -3270. -437. 22673.
#> # … with 997 more rows
# Plot the trend / seasonal / remainder components for every Dept
# (redundant doubled parentheses removed)
autoplot(components(walmart_decomposed))

## nycflights13

Let us try the flights dataset from the package nycflights13. Try
data(package = "nycflights13") in your Console.
We have the following datasets in nycflights13:

- airlines: Airline names
- airports: Airport metadata
- flights: Flights data
- planes: Plane metadata
- weather: Hourly weather data

Let us analyze the flights data:
# Load the flights data from nycflights13 and get a quick
# structural overview (categorical / quantitative / time variables)
data("flights", package = "nycflights13")
mosaic::inspect(flights)
#>
#> categorical variables:
#> name class levels n missing distribution
#> 1 carrier character 16 336776 0 UA (17.4%), B6 (16.2%), EV (16.1%) ...
#> 2 tailnum character 4043 334264 2512 N725MQ (0.2%), N722MQ (0.2%) ...
#> 3 origin character 3 336776 0 EWR (35.9%), JFK (33%), LGA (31.1%)
#> 4 dest character 105 336776 0 ORD (5.1%), ATL (5.1%), LAX (4.8%) ...
#>
#> quantitative variables:
#> name class min Q1 median Q3 max mean sd n missing
#> 1 year integer 2013 2013 2013 2013 2013 2013.000000 0.000000 336776 0
#> 2 month integer 1 4 7 10 12 6.548510 3.414457 336776 0
#> 3 day integer 1 8 16 23 31 15.710787 8.768607 336776 0
#> 4 dep_time integer 1 907 1401 1744 2400 1349.109947 488.281791 328521 8255
#> 5 sched_dep_time integer 106 906 1359 1729 2359 1344.254840 467.335756 336776 0
#> 6 dep_delay numeric -43 -5 -2 11 1301 12.639070 40.210061 328521 8255
#> 7 arr_time integer 1 1104 1535 1940 2400 1502.054999 533.264132 328063 8713
#> 8 sched_arr_time integer 1 1124 1556 1945 2359 1536.380220 497.457142 336776 0
#> 9 arr_delay numeric -86 -17 -5 14 1272 6.895377 44.633292 327346 9430
#> 10 flight integer 1 553 1496 3465 8500 1971.923620 1632.471938 336776 0
#> 11 air_time numeric 20 82 129 192 695 150.686460 93.688305 327346 9430
#> 12 distance numeric 17 502 872 1389 4983 1039.912604 733.233033 336776 0
#> 13 hour numeric 1 9 13 17 23 13.180247 4.661316 336776 0
#> 14 minute numeric 0 8 29 44 59 26.230100 19.300846 336776 0
#>
#> time variables:
#> name class first last min_diff max_diff n missing
#> 1 time_hour POSIXct 2013-01-01 05:00:00 2013-12-31 23:00:00 0 secs 25200 secs 336776 0
We have time-related columns; Apart from year, month, day we have
time_hour; and time-event numerical data such as arr_delay (arrival
delay) and dep_delay (departure delay). We also have categorical data
such as carrier, origin, dest, flight and tailnum of the aircraft. It is
also a large dataset containing 330K entries. Enough to play with!!
Let us replace the NAs in arr_delay and dep_delay with zeroes for
now, and convert it into a time-series object with tsibble:
# Build a tsibble of flight delays:
flights_delay_ts <- flights %>%
  # Treat missing arrival/departure delays as zero for this exercise
  mutate(arr_delay = replace_na(arr_delay, 0),
         dep_delay = replace_na(dep_delay, 0)) %>%
  select(time_hour, arr_delay, dep_delay,
         carrier, origin, dest, flight, tailnum) %>%
  # index + key must uniquely identify each row; no single column is
  # unique on its own, so the key needs *all five* identifiers
  tsibble::as_tsibble(index = time_hour,
                      key = c(carrier, origin, dest, flight, tailnum),
                      validate = TRUE) # confirm each entry is unique
flights_delay_ts
#> # A tsibble: 336,776 x 8 [1h] <America/New_York>
#> # Key: carrier, origin, dest, flight, tailnum [186,870]
#> time_hour arr_delay dep_delay carrier origin dest flight tailnum
#> <dttm> <dbl> <dbl> <chr> <chr> <chr> <int> <chr>
#> 1 2013-05-04 07:00:00 -11 -6 9E EWR ATL 3633 N170PQ
#> 2 2013-05-01 11:00:00 -24 -5 9E EWR ATL 4194 N170PQ
#> 3 2013-05-03 09:00:00 15 -6 9E EWR ATL 4362 N170PQ
#> 4 2013-05-02 09:00:00 -5 -6 9E EWR ATL 4362 N232PQ
#> # … with 336,772 more rows
Q.1. Plot the monthly average arrival delay by carrier
mean_arr_delays_by_carrier <- flights_delay_ts %>%
  group_by(carrier) %>%
  # index_by() collapses the index to a coarser period — the
  # time-series analogue of a dplyr group. It accepts year,
  # yearquarter, yearmonth, yearweek, or as.Date to pick the
  # granularity of the new time-grouping column.
  index_by(month = ~ yearmonth(.)) %>%
  summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE))
mean_arr_delays_by_carrier
#> # A tsibble: 185 x 3 [1M]
#> # Key: carrier [16]
#> carrier month mean_arr_delay
#> <chr> <mth> <dbl>
#> 1 9E 2013 Jan 9.60
#> 2 9E 2013 Feb 7.61
#> 3 9E 2013 Mar 1.89
#> 4 9E 2013 Apr 5.07
#> # … with 181 more rows
mean_arr_delays_by_carrier %>%
  timetk::plot_time_series(.date_var = month,
                           .value = mean_arr_delay,
                           .facet_vars = carrier,
                           .facet_ncol = 4,
                           # Keep .smooth off: monthly (or coarser)
                           # summaries have too few points and the
                           # smoother throws warnings. If enabled,
                           # use small .smooth_degree values (0, 1).
                           .smooth = FALSE,
                           .title = "Average Monthly Arrival Delays by Carrier")
Q.2. Plot a box plot of total flight delays (arrival + departure), binned by month, for each origin airport
# Box plots of total delay, binned by month, one facet per origin airport
flights_delay_ts %>%
  mutate(total_delay = arr_delay + dep_delay) %>%
  timetk::plot_time_series_boxplot(.date_var = time_hour,
                                   .value = total_delay,
                                   .color_var = origin,
                                   .facet_vars = origin,
                                   .period = "month",
                                   # smoothing warns here too, so keep it off
                                   .smooth = FALSE)